Our dataset is a random sample of 500 Airbnb listings drawn from all listings in Chicago from August 2008 to May 2017. The dataset contains attributes for each of the listings such as review scores, features of the listing (e.g., number of rooms, bathrooms, etc.), prices, location (coordinates, neighborhood), services provided (heat, wifi), and rules (whether smoking or pets are allowed). The data was scraped by Professor Laura Ziegler of the ISU statistics department.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Load the listings. stringsAsFactors is spelled out because its default
# changed from TRUE to FALSE in R 4.0.0, and the factor-based structure
# printed by str() depends on text columns being read as factors.
airbnb <- read.csv("airbnb.csv", stringsAsFactors = TRUE)
str(airbnb)
## 'data.frame': 500 obs. of 63 variables:
## $ id : int 14843458 722570 8153968 18327990 12673204 10344726 5737004 16826474 5414283 8610501 ...
## $ listing_url : Factor w/ 500 levels "https://www.airbnb.com/rooms/10043106",..: 131 431 453 316 56 4 391 243 381 464 ...
## $ name : Factor w/ 500 levels "#61 King Arthurs Court",..: 177 302 191 410 119 124 151 193 179 17 ...
## $ summary : Factor w/ 492 levels "","- Townhome 150 yds (2 min walk) from McCormick Place - Free Parking - Huge Room with queen bed (and futon) - Pr"| __truncated__,..: 124 1 283 421 463 126 280 292 323 175 ...
## $ space : Factor w/ 366 levels "","- This is a great, two bedroom, one bathroom home on the 1st floor of a four unit building. The first bedroom h"| __truncated__,..: 250 78 349 1 176 107 170 6 365 51 ...
## $ description : Factor w/ 499 levels "- Townhome 150 yds (2 min walk) from McCormick Place - Free Parking - Huge Room with queen bed (and futon) - Pr"| __truncated__,..: 127 117 286 427 470 129 283 295 326 178 ...
## $ neighborhood_overview : Factor w/ 331 levels "","------------------------------------------------------------------------------ SUPER CONVENIENT LOCATION!!! ---"| __truncated__,..: 102 1 119 1 86 186 187 3 82 220 ...
## $ notes : Factor w/ 225 levels "","- Any short-term stays of 3 or lesser nights (over a weekend) may be assessed a surcharge",..: 100 1 211 1 143 92 1 3 73 1 ...
## $ transit : Factor w/ 346 levels "","- 1 min walk to the Lawrence Red Line station - 1 mile from Metra station (commuter rail) - Divvy Bike station "| __truncated__,..: 227 1 269 1 59 100 300 9 131 252 ...
## $ access : Factor w/ 315 levels "","#94 is the door code. How to receive keys will be arranged ahead of time.",..: 194 1 312 1 163 297 1 52 172 22 ...
## $ interaction : Factor w/ 313 levels "","24/7 access to the host via phone, text, or email.",..: 301 1 300 1 121 178 1 223 103 198 ...
## $ house_rules : Factor w/ 318 levels "","-- Not handicap accessible (there are 59 stairs - no elevator!)",..: 107 151 41 225 98 176 267 236 55 143 ...
## $ host_id : int 20653807 3731751 16500117 6903096 34473759 45766549 29751294 111964625 2768284 37417772 ...
## $ host_url : Factor w/ 461 levels "https://www.airbnb.com/users/show/100027760",..: 106 205 82 369 185 264 156 33 143 206 ...
## $ host_name : Factor w/ 365 levels "Aama","Aamir",..: 170 211 314 117 206 327 111 208 68 218 ...
## $ host_since : Factor w/ 413 levels "01/02/2016","01/02/2017",..: 281 319 170 181 161 323 80 15 199 205 ...
## $ host_location : Factor w/ 31 levels "Barrington, Illinois, United States",..: 16 7 7 26 7 7 7 7 7 7 ...
## $ host_about : Factor w/ 319 levels "","\"Depth and breadth are crucial to creativity.\" -Adam M Grant \n\nChicago native. Buckeye nation.",..: 314 295 122 192 173 309 1 292 34 12 ...
## $ host_response_rate : Factor w/ 32 levels "0%","100%","33%",..: 2 2 2 2 2 2 32 2 2 10 ...
## $ host_is_superhost : Factor w/ 2 levels "f","t": 1 1 2 1 2 1 1 1 2 1 ...
## $ host_neighbourhood : Factor w/ 61 levels "","Albany Park",..: 1 28 1 1 28 28 28 28 1 21 ...
## $ host_verifications : Factor w/ 82 levels "['email', 'phone', 'amex', 'reviews', 'kba', 'work_email']",..: 23 71 24 76 71 71 71 26 70 76 ...
## $ host_has_profile_pic : Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ host_identity_verified : Factor w/ 2 levels "f","t": 2 2 2 1 2 2 2 1 2 1 ...
## $ street : Factor w/ 126 levels "Albany Park, Chicago, IL 60625, United States",..: 31 66 31 31 66 67 67 66 26 56 ...
## $ neighbourhood : Factor w/ 45 levels "Albany Park",..: 20 20 20 20 20 20 20 20 14 14 ...
## $ latitude : num 42 42 42 42 42 ...
## $ longitude : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ is_location_exact : Factor w/ 2 levels "f","t": 1 2 1 1 2 2 2 2 1 2 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 1 2 1 1 1 1 1 2 1 ...
## $ accommodates : int 3 4 4 2 3 3 2 6 2 4 ...
## $ bathrooms : num 1 2 1 1 1 1 1 2 1 1 ...
## $ bedrooms : int 1 2 1 1 2 0 1 3 1 1 ...
## $ beds : int 1 2 2 1 2 2 1 4 1 2 ...
## $ bed_type : Factor w/ 5 levels "Airbed","Couch",..: 5 5 5 5 5 5 5 5 5 1 ...
## $ amenities : Factor w/ 495 levels "{\"Air conditioning\",Kitchen,\"Pets allowed\",Heating,\"Family/kid friendly\",\"Smoke detector\",\"Carbon mono"| __truncated__,..: 428 151 189 12 375 92 68 155 187 436 ...
## $ price : int 69 139 65 80 150 83 85 85 59 120 ...
## $ monthly_price : int NA NA 1600 NA NA 695 NA NA 1470 NA ...
## $ security_deposit : int NA 300 NA NA NA NA NA 150 NA NA ...
## $ cleaning_fee : int 20 80 15 NA 35 NA NA 59 20 NA ...
## $ guests_included : int 1 3 2 1 1 2 1 2 2 4 ...
## $ price_extra_people : int 48 20 10 0 0 10 0 15 0 25 ...
## $ maximum_nights : int 14 21 1125 1125 1125 1125 1125 1125 31 1125 ...
## $ calendar_updated : Factor w/ 22 levels "1 week ago","12 months ago",..: 19 5 5 5 5 19 16 21 15 8 ...
## $ availability_30 : int 6 1 16 0 0 0 0 9 8 11 ...
## $ availability_60 : int 7 1 43 0 8 3 0 15 36 23 ...
## $ availability_90 : int 13 5 73 0 19 16 0 15 61 30 ...
## $ availability_365 : int 13 280 73 129 19 73 0 46 332 305 ...
## $ review_scores_cleanliness : int 9 10 10 NA 10 10 10 8 10 9 ...
## $ review_scores_communication: int 10 10 10 NA 10 10 10 10 10 10 ...
## $ review_scores_value : int 10 10 10 NA 10 10 10 10 10 9 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 1 1 2 1 1 1 1 1 1 ...
## $ cancellation_policy : Factor w/ 3 levels "flexible","moderate",..: 1 2 2 1 2 2 2 3 2 1 ...
## $ reviews_per_month : num 0.96 0.29 2.61 NA 0.43 2.28 0.16 6.32 3.47 1.58 ...
## $ cable_tv : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
## $ wireless_internet : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ kitchen : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ pets_allowed : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ breakfast : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
## $ heating : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 1 2 2 2 ...
## $ X24.hour_checkin : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 2 2 ...
## $ smoking_allowed : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ smoking_allowed.2 : int 0 0 0 0 0 0 1 0 0 0 ...
# Free-text and URL columns (listing_url through house_rules, plus host_url,
# host_about and street) are stored as plain character vectors
airbnb_char <- mutate_at(airbnb, c(2:12, 14, 18, 25), as.character)
# Set numerics where they're needed
library(stringr)
# Strip the percent sign, then convert to numeric; entries that are not a
# number become NA (as.numeric() warns about the coercion).
airbnb_char[["host_response_rate"]] <- as.numeric(
  str_remove(airbnb_char[["host_response_rate"]], "%")
)
## Warning: NAs introduced by coercion
# all others should be properly formatted
# Set boolean columns
# The "t"/"f" and "Yes"/"No" flags are recoded only inside the flag columns
# themselves. Comparing the whole data frame against those strings would also
# overwrite any free-text cell whose entire value is, say, "No".
flag_cols <- c(
  "host_is_superhost", "host_has_profile_pic", "host_identity_verified",
  "is_location_exact", "instant_bookable", "cable_tv", "wireless_internet",
  "kitchen", "pets_allowed", "breakfast", "heating", "X24.hour_checkin",
  "smoking_allowed"
)
# as.logical() parses "true"/"false" but not "t"/"f" or "Yes"/"No", so each
# flag is first rewritten as text (factors must become character for that).
recode_flag <- function(flag) {
  flag <- as.character(flag)
  flag[flag %in% c("f", "No")] <- "false"
  flag[flag %in% c("t", "Yes")] <- "true"
  flag
}
airbnb_char <- airbnb_char %>% mutate_at(flag_cols, recode_flag)
airbnb_clean <- airbnb_char %>% mutate_at(flag_cols, as.logical)
# boolean values are set to be TRUE and FALSE
# Set missing values to NA
# Every cell that compares equal to the empty string is overwritten with NA,
# across the whole data frame, in a single assignment.
airbnb_clean[airbnb_clean==""] = NA # Note: this should work for both character and factor variables
# Parse host_since from month/day/year text into Date values.
# Kept as the final cleaning step because it breaks slicing.
airbnb_clean[["host_since"]] <- as.Date(
  airbnb_clean[["host_since"]],
  format = "%m/%d/%Y"
)
Now, we can start with some exploration of individual variables. First off, we’ll look at distributions of some simple numerical variables.
library(ggplot2)
# Quick summary-statistics printer (many libraries offer this, but it is easy
# to write). Prints the total length, the non-missing count, and the range,
# mean, standard deviation, median, quartiles and IQR of `variable`; missing
# values are ignored by every statistic except the total count.
summary_stats <- function(variable) {
  # One "label value" line; cat()'s default separator supplies the spacing.
  show_stat <- function(label, value) cat(label, value, "\n")
  observed <- !is.na(variable)

  show_stat("Total Count: ", length(variable))
  show_stat("N: ", sum(observed))
  show_stat("Minimum: ", min(variable, na.rm = TRUE))
  show_stat("Maximum: ", max(variable, na.rm = TRUE))
  show_stat("Mean: ", mean(variable, na.rm = TRUE))
  show_stat("Standard Deviation: ", sd(variable, na.rm = TRUE))
  show_stat("Median: ", median(variable, na.rm = TRUE))
  cat("Quantiles:", "\n")
  print(quantile(as.numeric(variable), probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE))
  show_stat("IQR: ", IQR(variable, na.rm = TRUE))
}
Host response rate:
summary_stats(airbnb_clean$host_response_rate)
## Total Count: 500
## N: 480
## Minimum: 0
## Maximum: 100
## Mean: 96.23125
## Standard Deviation: 10.81784
## Median: 100
## Quantiles:
## 0% 25% 50% 75% 100%
## 0 100 100 100 100
## IQR: 0
Given that we have a median equal to our maximum, 100, along with such a high mean and low IQR, we clearly have a majority of hosts having a 100% response rate.
Number of guests accommodated:
summary_stats(airbnb_clean$accommodates)
## Total Count: 500
## N: 500
## Minimum: 1
## Maximum: 16
## Mean: 3.872
## Standard Deviation: 2.600219
## Median: 3
## Quantiles:
## 0% 25% 50% 75% 100%
## 1 2 3 5 16
## IQR: 3
Price:
summary_stats(airbnb_clean$price)
## Total Count: 500
## N: 500
## Minimum: 10
## Maximum: 950
## Mean: 135.416
## Standard Deviation: 125.9147
## Median: 100
## Quantiles:
## 0% 25% 50% 75% 100%
## 10 60 100 155 950
## IQR: 95
We will get four price categories based on our quantiles, which we’ll call very high, high, medium, and low, for later categorization use.
# Quartile cut points for price, taken from a single quantile() call
# (elements 2, 3 and 4 are the 25%, 50% and 75% quantiles).
price_quartiles <- quantile(airbnb_clean$price)
low <- price_quartiles[2]
medium <- price_quartiles[3]
high <- price_quartiles[4]
# Start from empty strings: vector(length = n) would create logical FALSE
# values that turn into the label "FALSE" once category names are assigned.
price_category <- character(length(airbnb_clean$price))
# Each category covers (lower cut, upper cut]. Every mask is built once and
# used both for the expected count and for the assignment it is checked against.
is_low <- airbnb_clean$price <= low
lowcheck <- sum(is_low)
price_category[is_low] <- "low"
print(lowcheck == sum(price_category == "low"))
## [1] TRUE
is_medium <- airbnb_clean$price > low & airbnb_clean$price <= medium
medcheck <- sum(is_medium)
price_category[is_medium] <- "medium"
print(medcheck == sum(price_category == "medium"))
## [1] TRUE
is_high <- airbnb_clean$price > medium & airbnb_clean$price <= high
hicheck <- sum(is_high)
price_category[is_high] <- "high"
print(hicheck == sum(price_category == "high"))
## [1] TRUE
is_very_high <- airbnb_clean$price > high
vcheck <- sum(is_very_high)
price_category[is_very_high] <- "very high"
print(vcheck == sum(price_category == "very high"))
## [1] TRUE
# All checks seem right, let's see if our distribution checks out:
# Stored as a factor with explicit levels so the categories sort by price.
airbnb_clean$price_category <- factor(price_category, levels = c("low", "medium", "high", "very high"))
# Bar chart of how many listings fall into each price category
ggplot(data = airbnb_clean, mapping = aes(x = price_category)) +
  geom_bar() +
  labs(
    title = "Price Category Distributions",
    x = "Price Category",
    y = "Count"
  )
# Looks reasonably even.
Maximum nights:
summary_stats(airbnb_clean$maximum_nights)
## Total Count: 500
## N: 500
## Minimum: 3
## Maximum: 1125
## Mean: 736.986
## Standard Deviation: 518.5947
## Median: 1125
## Quantiles:
## 0% 25% 50% 75% 100%
## 3 30 1125 1125 1125
## IQR: 1095
Note the high proportion of 1125s, probably the maximum allowed for a listing by the website.
Reviews per month:
summary_stats(airbnb_clean$reviews_per_month)
## Total Count: 500
## N: 426
## Minimum: 0.05
## Maximum: 9.75
## Mean: 2.135563
## Standard Deviation: 1.960553
## Median: 1.485
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.050 0.620 1.485 3.245 9.750
## IQR: 2.625
This gives an interesting variety, let’s look at a histogram.
# Histogram of reviews per month (default binning)
ggplot(data = airbnb_clean, mapping = aes(x = reviews_per_month)) +
  geom_histogram() +
  labs(
    title = "Reviews Per Month Distribution",
    x = "Reviews Per Month",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 74 rows containing non-finite values (stat_bin).
We can see a right skew with a floor at 0, so we know most locations aren’t reviewed many times per month.
Cleanliness review score (out of 10):
summary_stats(airbnb_clean$review_scores_cleanliness)
## Total Count: 500
## N: 425
## Minimum: 4
## Maximum: 10
## Mean: 9.477647
## Standard Deviation: 0.8467998
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 4 9 10 10 10
## IQR: 1
Communication review score (out of 10):
summary_stats(airbnb_clean$review_scores_communication)
## Total Count: 500
## N: 424
## Minimum: 8
## Maximum: 10
## Mean: 9.856132
## Standard Deviation: 0.4016045
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 8 10 10 10 10
## IQR: 0
Value review score (out of 10):
summary_stats(airbnb_clean$review_scores_value)
## Total Count: 500
## N: 422
## Minimum: 6
## Maximum: 10
## Mean: 9.56872
## Standard Deviation: 0.6673718
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 6 9 10 10 10
## IQR: 1
All three review categories tend toward the high end, so a left skew is likely. Let’s check:
# Histogram of cleanliness review scores (default binning)
ggplot(data = airbnb_clean, mapping = aes(x = review_scores_cleanliness)) +
  geom_histogram() +
  labs(
    title = "Cleanliness Review Scores Distribution",
    x = "Cleanliness Review Score",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 75 rows containing non-finite values (stat_bin).
# Histogram of communication review scores (default binning)
ggplot(data = airbnb_clean, mapping = aes(x = review_scores_communication)) +
  geom_histogram() +
  labs(
    title = "Communication Review Scores Distribution",
    x = "Communication Review Score",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 76 rows containing non-finite values (stat_bin).
# Histogram of value review scores (default binning)
ggplot(data = airbnb_clean, mapping = aes(x = review_scores_value)) +
  geom_histogram() +
  labs(
    title = "Value Review Scores Distribution",
    x = "Value Review Score",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 78 rows containing non-finite values (stat_bin).
Unsurprising. People tend to give high reviews in the dataset.
# Count the distinct values of several categorical columns to judge whether
# each has few enough categories to plot
# Distinct host names:
n_distinct(airbnb_clean$host_name)
## [1] 365
# Distinct host locations:
n_distinct(airbnb_clean$host_location)
## [1] 31
# Distinct host neighborhoods:
n_distinct(airbnb_clean$host_neighbourhood)
## [1] 61
# Distinct listing neighborhoods:
n_distinct(airbnb_clean$neighbourhood)
## [1] 45
# Distinct room types:
n_distinct(airbnb_clean$room_type)
## [1] 3
# Distinct bed types:
n_distinct(airbnb_clean$bed_type)
## [1] 5
We can see that some variables have more values than can comfortably be put into a bar graph, but we can easily look at distributions for room and bed types.
# Listings per room type, with each bar split by price category
ggplot(data = airbnb_clean, mapping = aes(x = room_type, fill = price_category)) +
  geom_bar() +
  labs(
    title = "Distribution of Room Types",
    x = "Room Type",
    y = "Count"
  )
So, we can see that most rooms are an entire home or apartment, many are private rooms, and very few are shared rooms. Note that the majority of high and very high prices appear in the entire home category, unsurprisingly. Most low prices are in the private room category. While private rooms are not the least luxurious, they are much more common than shared rooms, which have a much higher proportion of low prices within their own category. In summary, a higher price means more private space.
# Listings per bed type
ggplot(data = airbnb_clean, mapping = aes(x = bed_type)) +
  geom_bar() +
  labs(
    title = "Distribution of Bed Types",
    x = "Bed Type",
    y = "Count"
  )
The overwhelming majority of beds are real beds, with very few out of the 500 total listings in the data set being airbeds, couches, futons, or pull-out sofas.
# Next: when did hosts start? Histogram of host_since (default binning)
ggplot(data = airbnb_clean, mapping = aes(x = host_since)) +
  geom_histogram() +
  labs(
    title = "Distribution of Host Start Dates",
    x = "Start Date",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We have a left-skewed distribution, meaning in this context that more of the currently active hosts started recently than long ago. I.e., there has likely either been a sharp rise in hosts in the area over time, or hosts tend to host for a short amount of time before quitting, leaving mostly recently started hosts among the active ones. The former is the more obvious explanation, but we would need data about inactive hosts to rule out the latter.
Let’s look at some data involving individual hosts.
# The number of distinct host ids tells us how many hosts we have
print(n_distinct(airbnb_clean$host_id))
## [1] 461
# Out of curiosity, does this match the number of distinct host names?
print(n_distinct(airbnb_clean$host_name))
## [1] 365
So, it seems at least one host (almost certainly more) has multiple listings available, given that our dataset has 500 listings. Also, hosts share names.
What are some common names?
# Tally listings per host name, most frequent first, then show only the
# names that occur at least twice
repeat_names <- sort(table(airbnb_clean[["host_name"]]), decreasing = TRUE)
repeat_names[repeat_names >= 2]
##
## Joe Sonder John Paul Justin
## 10 9 7 7 6
## Nick Michael Chris David Jessica
## 6 5 4 4 4
## Laura Lisa Liz Mark Matt
## 4 4 4 4 4
## Mike The Flats Amanda Charles Dan
## 4 4 3 3 3
## Daniel Freehand James Jennifer Maria
## 3 3 3 3 3
## Mario Nicole Sarah Sharon Stephanie
## 3 3 3 3 3
## Tom Ami Andrew Ashley At Home Inn
## 3 2 2 2 2
## Catherine Christopher Dana Emily Frank
## 2 2 2 2 2
## Jane Jeff Jenny Jimmy Kari
## 2 2 2 2 2
## Kate Kim Kristi Leon Liliana
## 2 2 2 2 2
## Lori Mary Megan Mejai Kai Melanie & Joe
## 2 2 2 2 2
## Monica Natalia Pamela Peter Rebecca
## 2 2 2 2 2
## Ross Steve Terry Thomas Trevor
## 2 2 2 2 2
Some of these are obviously common names, e.g., Joe, John, Paul. Some seem to be a single business with multiple properties, like “At Home Inn” and “The Flats”. Similarly, a bit of research shows that our second most common host name, “Sonder”, is also such a business.
Let’s look at ratings by name
# Total rating = sum of the three individual review scores (maximum 30)
airbnb_clean <- airbnb_clean %>%
  mutate(
    review_scores_total =
      (review_scores_communication + review_scores_cleanliness + review_scores_value)
  )
# Mean total per host name, best first. mean() is called without na.rm, so a
# name with any missing total gets NA and is removed by na.omit().
avg_score_by_name <- airbnb_clean %>%
  group_by(host_name) %>%
  summarize(avg = mean(review_scores_total)) %>%
  na.omit() %>%
  arrange(desc(avg))
head(avg_score_by_name, 15)
## # A tibble: 15 x 2
## host_name avg
## <fct> <dbl>
## 1 Adi 30
## 2 Alan 30
## 3 Alex 30
## 4 Alexander 30
## 5 Ali 30
## 6 Alissa 30
## 7 Amber 30
## 8 Amrit Rania 30
## 9 Amy 30
## 10 Andy 30
## 11 Anjli 30
## 12 Anna-Lisa 30
## 13 Anne 30
## 14 Anne-Marie 30
## 15 April 30
We can see we have a lot of 30/30 total scores, meaning many hosts got very good ratings, as mentioned earlier. The low scores, however, are unique. Let’s look at the bottom three.
tail(avg_score_by_name, 3)
## # A tibble: 3 x 2
## host_name avg
## <fct> <dbl>
## 1 Wilson 22
## 2 Niki 20
## 3 Sam 18
Since these three scores are unique, we can use them as keys for finding more info on the hosts. Had we grouped by host_id, we could have used that as the key instead, but here we used names for the sake of readability.
# Pull the listing behind each of the three lowest totals, keyed on the host
# name together with its total score
sam <- filter(airbnb_clean, review_scores_total == 18, host_name == "Sam")
niki <- filter(airbnb_clean, review_scores_total == 20, host_name == "Niki")
wilson <- filter(airbnb_clean, review_scores_total == 22, host_name == "Wilson")
We can get a quick data frame of our three hosts of interest with host relevant data.
low_rated = rbind(sam, niki, wilson)
Let’s look at their ratings.
# Host name plus the three individual review scores, selected by column name
# (positions 15 and 49:51) so the selection survives columns being added or
# reordered
low_rated[, c(
  "host_name",
  "review_scores_cleanliness",
  "review_scores_communication",
  "review_scores_value"
)]
## host_name review_scores_cleanliness review_scores_communication
## 1 Sam 4 8
## 2 Niki 6 8
## 3 Wilson 6 8
## review_scores_value
## 1 6
## 2 6
## 3 8
So, it seems that while communication is good all around, these hosts have average to good value scores and poor to average cleanliness scores, making cleanliness their weakest factor in general. Let’s see how cleanliness generally compares to other review scores across all hosts.
# Per-host-name mean of each review score and of the total. Means are taken
# without na.rm, so names with any missing score are dropped by na.omit().
avg_scores_by_name <- airbnb_clean %>%
  group_by(host_name) %>%
  summarize(
    avg_cl = mean(review_scores_cleanliness),
    avg_com = mean(review_scores_communication),
    avg_val = mean(review_scores_value),
    avg_total = mean(review_scores_total)
  ) %>%
  na.omit()
Now, we can look at summary stats of each rating variable
Cleanliness:
summary_stats(avg_scores_by_name$avg_cl)
## Total Count: 292
## N: 292
## Minimum: 4
## Maximum: 10
## Mean: 9.477645
## Standard Deviation: 0.8622203
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 4 9 10 10 10
## IQR: 1
Communication:
summary_stats(avg_scores_by_name$avg_com)
## Total Count: 292
## N: 292
## Minimum: 8
## Maximum: 10
## Mean: 9.860826
## Standard Deviation: 0.3800228
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 8 10 10 10 10
## IQR: 0
Value:
summary_stats(avg_scores_by_name$avg_val)
## Total Count: 292
## N: 292
## Minimum: 6
## Maximum: 10
## Mean: 9.585331
## Standard Deviation: 0.6301351
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 6 9 10 10 10
## IQR: 1
We can see that though our base review scores don’t vary too much in mean, cleanliness has the lowest mean (~9.48), the lowest minimum (4), and the highest standard deviation (~0.862). So, there does seem to be a bit of a lower lump in the tail of our distribution (which we saw in the histograms above). Let’s see how many 4s we actually have.
# Number of listings with a cleanliness score of exactly 4 (missing scores ignored)
sum(airbnb_clean$review_scores_cleanliness == 4, na.rm = TRUE)
## [1] 1
Just one it seems. Though we only have 500 listings, it seems Sam’s is particularly dirty.
We can predict some practical values using linear regression models.
# Inspect the cleaned data, then regress price on bedrooms, bathrooms and
# the number of guests accommodated
str(airbnb_clean)
fit <- lm(
  price ~ bedrooms + bathrooms + accommodates,
  data = airbnb_clean
)
summary(fit)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + accommodates, data = airbnb_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -267.21 -46.23 -18.13 22.81 659.33
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -27.658 11.580 -2.388 0.017291 *
## bedrooms 30.313 8.617 3.518 0.000475 ***
## bathrooms 68.862 10.626 6.480 2.22e-10 ***
## accommodates 7.775 2.813 2.764 0.005916 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101.6 on 495 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3544, Adjusted R-squared: 0.3505
## F-statistic: 90.59 on 3 and 495 DF, p-value: < 2.2e-16
Here our F-statistic is 90.59 on 3 and 495 degrees of freedom, with a p-value < 0.0001, indicating that the model is useful for predicting prices. However, the R-squared and adjusted R-squared values are fairly low: the adjusted R-squared of 0.3505 means the model explains only about 35.05% of the variability in price.
The dataset provides useful location data for each listing that can provide some insight into different geographical areas.
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(RColorBrewer)
# Bounding box of the base map as longitude/latitude limits, in the order
# left, bottom, right, top
bbox <- c(-87.8, 41.73, -87.5, 42.05)
# Download the terrain tiles for that box at zoom level 12.
# NOTE(review): the tile.stamen.com host that get_stamenmap() fetches from may
# no longer serve tiles; confirm before re-running.
m <- get_stamenmap(bbox = bbox, zoom = 12)
## Source : http://tile.stamen.com/terrain/12/1049/1519.png
## Source : http://tile.stamen.com/terrain/12/1050/1519.png
## Source : http://tile.stamen.com/terrain/12/1051/1519.png
## Source : http://tile.stamen.com/terrain/12/1052/1519.png
## Source : http://tile.stamen.com/terrain/12/1049/1520.png
## Source : http://tile.stamen.com/terrain/12/1050/1520.png
## Source : http://tile.stamen.com/terrain/12/1051/1520.png
## Source : http://tile.stamen.com/terrain/12/1052/1520.png
## Source : http://tile.stamen.com/terrain/12/1049/1521.png
## Source : http://tile.stamen.com/terrain/12/1050/1521.png
## Source : http://tile.stamen.com/terrain/12/1051/1521.png
## Source : http://tile.stamen.com/terrain/12/1052/1521.png
## Source : http://tile.stamen.com/terrain/12/1049/1522.png
## Source : http://tile.stamen.com/terrain/12/1050/1522.png
## Source : http://tile.stamen.com/terrain/12/1051/1522.png
## Source : http://tile.stamen.com/terrain/12/1052/1522.png
## Source : http://tile.stamen.com/terrain/12/1049/1523.png
## Source : http://tile.stamen.com/terrain/12/1050/1523.png
## Source : http://tile.stamen.com/terrain/12/1051/1523.png
## Source : http://tile.stamen.com/terrain/12/1052/1523.png
## Source : http://tile.stamen.com/terrain/12/1049/1524.png
## Source : http://tile.stamen.com/terrain/12/1050/1524.png
## Source : http://tile.stamen.com/terrain/12/1051/1524.png
## Source : http://tile.stamen.com/terrain/12/1052/1524.png
# Separate data frame for the mapping plots: only listings whose location is
# exact (is_location_exact is TRUE). which() skips NA flags; a bare logical
# subscript containing NA would add all-NA rows instead.
airbnb_clean2 <- airbnb_clean[which(airbnb_clean$is_location_exact), ]
# Dropping the West Elsdon neighbourhood because there is only one listing in
# it and it was skewing the data/graphs due to its irregularity. %in% never
# returns NA, so listings with a missing neighbourhood are kept intact.
airbnb_clean2 <- airbnb_clean2[!(airbnb_clean2$neighbourhood %in% "West Elsdon"), ]
# Exact-location listings on the base map, coloured by price category
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = price_category),
    size = .9,
    alpha = .75
  ) +
  coord_sf(xlim = c(-87.8, -87.5), ylim = c(41.73, 42.05), expand = FALSE) +
  labs(
    title = "Price Categories Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Price Category"
  ) +
  scale_color_brewer(palette = "Dark2")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
# Mean price per neighbourhood, with neighbourhoods ordered via
# reorder(neighbourhood, -price); each bar carries its rounded mean as a label
ggplot(
  data = airbnb_clean2,
  mapping = aes(x = reorder(neighbourhood, -price), y = price)
) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2)) +
  scale_y_continuous(breaks = seq(0, max(airbnb_clean$price), by = 20)) +
  labs(
    title = "Neighbourhoods By Mean Price",
    x = "Neighbourhood",
    y = "Mean Price ($)"
  ) +
  stat_summary(
    mapping = aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 3,
    hjust = -0.1,
    angle = 90
  )
#the labels on each bar were made using code similar to the one from user agstudy on stack over flow here:
#https://stackoverflow.com/questions/20139978/ggplot2-label-values-of-barplot-that-uses-fun-y-mean-of-stat-summary
# Exact-location listings on the base map, coloured by room type
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = room_type),
    size = .9,
    alpha = .75
  ) +
  coord_sf(xlim = c(-87.8, -87.5), ylim = c(41.73, 42.05), expand = FALSE) +
  labs(
    title = "Room Types Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Room Type"
  ) +
  scale_color_brewer(palette = "Dark2")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
# Share of each room type within every neighbourhood (bars scaled to 100%),
# with neighbourhoods ordered via reorder(neighbourhood, -price)
ggplot(
  data = airbnb_clean2,
  mapping = aes(x = reorder(neighbourhood, -price), fill = room_type)
) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2)) +
  labs(
    title = "Neighbourhood Room Types Ordered by Mean Price Descending",
    x = "Neighbourhood",
    y = "Room Type %",
    fill = "Room Type"
  ) +
  scale_fill_brewer(palette = "Set2")
# Exact-location listings on the base map, coloured by number of bedrooms
# (treated as a discrete category)
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = factor(bedrooms)),
    size = .9,
    alpha = .75
  ) +
  coord_sf(xlim = c(-87.8, -87.5), ylim = c(41.73, 42.05), expand = FALSE) +
  labs(
    title = "# of Bedrooms Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Bedrooms"
  ) +
  scale_color_brewer(palette = "Set1")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 3 rows containing missing values (geom_point).
# Mean number of bedrooms per neighbourhood, with neighbourhoods ordered via
# reorder(neighbourhood, -price); each bar carries its rounded mean as a label
ggplot(
  data = airbnb_clean2,
  mapping = aes(x = reorder(neighbourhood, -price), y = bedrooms)
) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2)) +
  labs(
    title = "Mean Bedrooms per Neighbourhood Sorted by Mean Price Descending",
    x = "Neighbourhood",
    y = "Mean Bedrooms"
  ) +
  stat_summary(
    mapping = aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 2,
    vjust = -0.5
  )
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing non-finite values (stat_summary).
# Exact-location listings on the base map, coloured by how many guests each
# listing accommodates (treated as a discrete category). The title spelling
# is corrected to "Accommodations".
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = factor(accommodates)),
    size = .9,
    alpha = .75
  ) +
  coord_sf(xlim = c(-87.8, -87.5), ylim = c(41.73, 42.05), expand = FALSE) +
  labs(
    title = "Accommodations for # of People",
    x = "Longitude",
    y = "Latitude",
    color = "# of People"
  )
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
# Mean number of guests accommodated per neighbourhood, with neighbourhoods
# ordered via reorder(neighbourhood, -price); bars are labelled with their
# rounded mean
ggplot(
  data = airbnb_clean2,
  mapping = aes(x = reorder(neighbourhood, -price), y = accommodates)
) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2)) +
  scale_y_continuous(breaks = seq(0, max(airbnb_clean$accommodates), by = 1)) +
  labs(
    title = "Mean # of Guest Accommodations per Neighbourhood Ordered by Mean Price",
    x = "Neighbourhood",
    y = "Mean Accommodations"
  ) +
  stat_summary(
    mapping = aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 3,
    angle = 90,
    hjust = -.11
  )
Several of the variables here come in the form of long text descriptions from which useful information can be pulled with some effort. [Kaleb]
library(stringi)
# Flag listings whose title mentions each marketing word. Matching ignores
# letter case, so "cozy" and "COZY" count as well as "Cozy".
ignore_case <- stri_opts_fixed(case_insensitive = TRUE)
downtown <- stri_detect_fixed(airbnb_clean$name, "Downtown", opts_fixed = ignore_case)
modern <- stri_detect_fixed(airbnb_clean$name, "Modern", opts_fixed = ignore_case)
charming <- stri_detect_fixed(airbnb_clean$name, "Charming", opts_fixed = ignore_case)
cozy <- stri_detect_fixed(airbnb_clean$name, "Cozy", opts_fixed = ignore_case)
Now we will look to see if there are any cases where the price is generally higher if a certain word is included:
Downtown:
# Price against whether the title contains "Downtown"
ggplot(data = airbnb_clean, mapping = aes(x = downtown, y = price)) +
  geom_point()
Modern:
# Price against whether the title contains "Modern"
ggplot(data = airbnb_clean, mapping = aes(x = modern, y = price)) +
  geom_point()
Charming:
# Price against whether the title contains "Charming"
ggplot(data = airbnb_clean, mapping = aes(x = charming, y = price)) +
  geom_point()
Cozy:
# Price against whether the title contains "Cozy"
ggplot(data = airbnb_clean, mapping = aes(x = cozy, y = price)) +
  geom_point()
Let’s see if the effect of having access to certain types of transportation shows up in terms of a price change.
# Flag transit descriptions that mention a bus. "bus"/"buses" must appear as
# a whole word (any letter case); a plain substring search would also hit
# words such as "business" or "busy".
bus <- stri_detect_regex(
  airbnb_clean$transit,
  "\\bbus(es)?\\b",
  opts_regex = stri_opts_regex(case_insensitive = TRUE)
)
ggplot(airbnb_clean, aes(bus, price)) + geom_point()
Let’s compare that to entries that contain “car”:
# Flag transit descriptions that mention a car. "car"/"cars" must appear as a
# whole word (any letter case); a plain substring search would also hit
# words such as "card" or "care".
car <- stri_detect_regex(
  airbnb_clean$transit,
  "\\bcars?\\b",
  opts_regex = stri_opts_regex(case_insensitive = TRUE)
)
ggplot(airbnb_clean, aes(car, price)) + geom_point()
We should take a look at the length of the title and see what that can tell us:
# Character count of each listing title, plotted against price
L.name <- stri_length(airbnb_clean[["name"]])
ggplot(data = airbnb_clean, mapping = aes(x = L.name, y = price)) +
  geom_point()
Let’s do the same thing with the summary and description:
# Character count of each listing summary, plotted against price
L.summary <- stri_length(airbnb_clean[["summary"]])
ggplot(data = airbnb_clean, mapping = aes(x = L.summary, y = price)) +
  geom_point()
## Warning: Removed 7 rows containing missing values (geom_point).
# Character count of each listing description, plotted against price
L.description <- stri_length(airbnb_clean[["description"]])
ggplot(data = airbnb_clean, mapping = aes(x = L.description, y = price)) +
  geom_point()